Some additional data wrangling. Making one final time-series dataset which includes only the covid-related features, all time-series observations with the associated coordinates and class designations from the PCA.
no_covid <- read.csv("E:\\ProjectData\\unacast\\post-pca.csv")
covid <- read.csv("E:\\ProjectData\\unacast\\covid_ts.csv")
dims <- no_covid %>% dplyr::select(FIPS, Dim.1,Dim.2,class2,polyname)
ts_data <- inner_join(covid,dims,by = 'FIPS') %>% dplyr::select(-X)
ts_data$date <- as.Date(ts_data$date)
maps::county.fips %>%
as_tibble %>%
extract(polyname, c("region", "subregion"), "^([^,]+),([^,]+)$") -> dfips
map_data("county") %>%
left_join(dfips) -> data
## Joining, by = c("region", "subregion")
Cool looking, but not very helpful…
plot_ly(dims, x = ~Dim.1, y = ~Dim.2, z = ~no_covid$Median.Household.Income,
marker = list(color = ~no_covid$Median.Household.Income, colorscale = c("Viridis"), showscale = TRUE)) %>%
add_markers() %>%
layout(
title = 'Counties by PCA Coordinates',
scene = list(xaxis = list(title = 'Dim 1'),
yaxis = list(title = 'Dim 2'),
zaxis = list(title = 'MHHI')))
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
Fitting a linear regression model to see which variables are significant for predicting the metrics we care about before covid
Creating a subset based on a single day (2/24/2020)
day <- ts_data %>% filter(date == as.Date("2020-02-24"))
features <- day %>% dplyr::select(-c("FIPS","covid","date","county_name","state_code","polyname","week"))
vis_only <- features %>% dplyr::select(-c("daily_distance_diff","encounters_rate"))
dist_only <- features %>% dplyr::select(-c("daily_visitation_diff","encounters_rate"))
encounter_only <- features %>% dplyr::select(-c("daily_distance_diff","daily_visitation_diff"))
Basic LM - each of the three for 2/24/2020
#Visitation only
model_vis <- lm(daily_visitation_diff ~., data = vis_only)
summary(model_vis)
##
## Call:
## lm(formula = daily_visitation_diff ~ ., data = vis_only)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.49531 -0.03238 0.01442 0.02863 0.58661
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0588474 0.0042707 -13.779 < 2e-16 ***
## cases 0.0051519 0.0039278 1.312 0.19
## deaths NA NA NA NA
## death_rate NA NA NA NA
## death_per_capita NA NA NA NA
## Dim.1 0.0045229 0.0004901 9.228 < 2e-16 ***
## Dim.2 -0.0067339 0.0011163 -6.032 1.81e-09 ***
## class2 0.0047926 0.0008216 5.834 6.01e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06529 on 3002 degrees of freedom
## Multiple R-squared: 0.04571, Adjusted R-squared: 0.04444
## F-statistic: 35.95 on 4 and 3002 DF, p-value: < 2.2e-16
#Distance only
model_dist <- lm(daily_distance_diff ~., data = dist_only)
summary(model_dist)
##
## Call:
## lm(formula = daily_distance_diff ~ ., data = dist_only)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.38959 -0.03679 -0.00357 0.03311 0.64803
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0007875 0.0046599 0.169 0.865817
## cases 0.0003569 0.0042858 0.083 0.933645
## deaths NA NA NA NA
## death_rate NA NA NA NA
## death_per_capita NA NA NA NA
## Dim.1 0.0019696 0.0005348 3.683 0.000235 ***
## Dim.2 -0.0026702 0.0012180 -2.192 0.028440 *
## class2 0.0021450 0.0008964 2.393 0.016784 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07124 on 3002 degrees of freedom
## Multiple R-squared: 0.007572, Adjusted R-squared: 0.006249
## F-statistic: 5.726 on 4 and 3002 DF, p-value: 0.0001373
#Encounter rate only
model_encounter <- lm(encounters_rate ~., data = encounter_only)
summary(model_encounter)
##
## Call:
## lm(formula = encounters_rate ~ ., data = encounter_only)
##
## Residuals:
## Min 1Q Median 3Q Max
## -689.4 -7.0 1.9 10.5 4606.3
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -36.1280 5.8083 -6.220 5.66e-10 ***
## cases -12.0736 5.3420 -2.260 0.0239 *
## deaths NA NA NA NA
## death_rate NA NA NA NA
## death_per_capita NA NA NA NA
## Dim.1 -10.4523 0.6666 -15.680 < 2e-16 ***
## Dim.2 -16.4602 1.5182 -10.842 < 2e-16 ***
## class2 7.9907 1.1174 7.151 1.07e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 88.79 on 3002 degrees of freedom
## Multiple R-squared: 0.1018, Adjusted R-squared: 0.1007
## F-statistic: 85.1 on 4 and 3002 DF, p-value: < 2.2e-16
Creating a subset based on a single day (3/08/2020)
day <- ts_data %>% filter(date == as.Date("2020-03-08"))
features <- day %>% dplyr::select(-c("FIPS","covid","date","county_name","state_code","polyname","week"))
vis_only <- features %>% dplyr::select(-c("daily_distance_diff","encounters_rate"))
dist_only <- features %>% dplyr::select(-c("daily_visitation_diff","encounters_rate"))
encounter_only <- features %>% dplyr::select(-c("daily_distance_diff","daily_visitation_diff"))
Basic LM - each of the three for 3/08/2020
#Visitation only
model_vis <- lm(daily_visitation_diff ~., data = vis_only)
summary(model_vis)
##
## Call:
## lm(formula = daily_visitation_diff ~ ., data = vis_only)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.54732 -0.01780 -0.00307 0.02298 0.52668
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.716e-02 4.904e-03 -3.498 0.000475 ***
## cases 9.774e-05 7.521e-04 0.130 0.896610
## deaths 4.482e-03 7.041e-03 0.637 0.524496
## death_rate 1.212e-01 1.106e-01 1.095 0.273476
## death_per_capita -2.132e+04 1.196e+04 -1.783 0.074711 .
## Dim.1 5.984e-03 5.825e-04 10.273 < 2e-16 ***
## Dim.2 -4.005e-03 1.282e-03 -3.124 0.001798 **
## class2 2.150e-03 9.432e-04 2.280 0.022686 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07495 on 2999 degrees of freedom
## Multiple R-squared: 0.04543, Adjusted R-squared: 0.0432
## F-statistic: 20.39 on 7 and 2999 DF, p-value: < 2.2e-16
#Distance only
model_dist <- lm(daily_distance_diff ~., data = dist_only)
summary(model_dist)
##
## Call:
## lm(formula = daily_distance_diff ~ ., data = dist_only)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.44060 -0.04789 0.00305 0.04832 0.99506
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.691e-03 6.123e-03 0.603 0.5466
## cases -2.413e-03 9.390e-04 -2.570 0.0102 *
## deaths 7.915e-03 8.791e-03 0.900 0.3680
## death_rate 1.364e-01 1.381e-01 0.987 0.3235
## death_per_capita -1.362e+04 1.493e+04 -0.912 0.3619
## Dim.1 -1.393e-03 7.273e-04 -1.915 0.0556 .
## Dim.2 1.749e-03 1.600e-03 1.093 0.2745
## class2 -4.925e-03 1.178e-03 -4.182 2.97e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09358 on 2999 degrees of freedom
## Multiple R-squared: 0.01364, Adjusted R-squared: 0.01134
## F-statistic: 5.925 on 7 and 2999 DF, p-value: 7.339e-07
#Encounter rate only
model_encounter <- lm(encounters_rate ~., data = encounter_only)
summary(model_encounter)
##
## Call:
## lm(formula = encounters_rate ~ ., data = encounter_only)
##
## Residuals:
## Min 1Q Median 3Q Max
## -252.60 -2.74 0.62 3.67 1440.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.088e+01 1.891e+00 -5.753 9.67e-09 ***
## cases -1.272e+00 2.899e-01 -4.388 1.18e-05 ***
## deaths 1.971e+00 2.714e+00 0.726 0.468
## death_rate -3.667e+00 4.264e+01 -0.086 0.931
## death_per_capita 1.165e+05 4.610e+06 0.025 0.980
## Dim.1 -4.417e+00 2.246e-01 -19.668 < 2e-16 ***
## Dim.2 -5.651e+00 4.941e-01 -11.438 < 2e-16 ***
## class2 2.509e+00 3.636e-01 6.900 6.33e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.89 on 2999 degrees of freedom
## Multiple R-squared: 0.1417, Adjusted R-squared: 0.1396
## F-statistic: 70.7 on 7 and 2999 DF, p-value: < 2.2e-16
Here I fitted the same linear regression on each of the three adherence metrics to see which variables are significant for two randomly selected dates post covid…
Creating a subset based on a single day (4/1/2020)
day <- ts_data %>% filter(date == as.Date("2020-04-01"))
features <- day %>% dplyr::select(-c("FIPS","covid","date","county_name","state_code","polyname","week"))
vis_only <- features %>% dplyr::select(-c("daily_distance_diff","encounters_rate"))
dist_only <- features %>% dplyr::select(-c("daily_visitation_diff","encounters_rate"))
encounter_only <- features %>% dplyr::select(-c("daily_distance_diff","daily_visitation_diff"))
Basic LM - each of the three for 4/01/2020
#Visitation only
model_vis <- lm(daily_visitation_diff ~., data = vis_only)
summary(model_vis)
##
## Call:
## lm(formula = daily_visitation_diff ~ ., data = vis_only)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7256 -0.1369 -0.0087 0.1546 0.8489
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.986e-01 1.245e-02 -40.051 <2e-16 ***
## cases 2.917e-05 1.324e-05 2.203 0.0277 *
## deaths 9.861e-04 7.870e-04 1.253 0.2103
## death_rate -6.764e-02 5.789e-02 -1.169 0.2427
## death_per_capita -2.902e+02 1.763e+02 -1.646 0.0998 .
## Dim.1 5.668e-02 1.622e-03 34.942 <2e-16 ***
## Dim.2 -5.340e-02 3.288e-03 -16.241 <2e-16 ***
## class2 4.168e-02 2.391e-03 17.431 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1894 on 2999 degrees of freedom
## Multiple R-squared: 0.4086, Adjusted R-squared: 0.4073
## F-statistic: 296.1 on 7 and 2999 DF, p-value: < 2.2e-16
#Distance only
model_dist <- lm(daily_distance_diff ~., data = dist_only)
summary(model_dist)
##
## Call:
## lm(formula = daily_distance_diff ~ ., data = dist_only)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.53484 -0.07976 -0.00942 0.06862 0.70076
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.886e-01 9.030e-03 -31.963 < 2e-16 ***
## cases -1.037e-05 9.606e-06 -1.080 0.2804
## deaths -5.911e-04 5.709e-04 -1.035 0.3006
## death_rate 6.604e-02 4.199e-02 1.573 0.1159
## death_per_capita -2.700e+02 1.279e+02 -2.111 0.0348 *
## Dim.1 2.156e-02 1.177e-03 18.319 < 2e-16 ***
## Dim.2 -2.199e-02 2.385e-03 -9.219 < 2e-16 ***
## class2 7.915e-03 1.735e-03 4.564 5.23e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1374 on 2999 degrees of freedom
## Multiple R-squared: 0.1819, Adjusted R-squared: 0.18
## F-statistic: 95.28 on 7 and 2999 DF, p-value: < 2.2e-16
#Encounter rate only
model_encounter <- lm(encounters_rate ~., data = encounter_only)
summary(model_encounter)
##
## Call:
## lm(formula = encounters_rate ~ ., data = encounter_only)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.970 -0.307 0.023 0.264 62.130
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.065e+00 1.164e-01 -9.150 < 2e-16 ***
## cases -1.471e-04 1.238e-04 -1.189 0.235
## deaths -3.681e-02 7.358e-03 -5.003 5.97e-07 ***
## death_rate -3.790e-01 5.412e-01 -0.700 0.484
## death_per_capita 1.643e+03 1.648e+03 0.997 0.319
## Dim.1 -4.161e-01 1.517e-02 -27.434 < 2e-16 ***
## Dim.2 -3.402e-01 3.074e-02 -11.068 < 2e-16 ***
## class2 1.025e-01 2.235e-02 4.587 4.69e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.771 on 2999 degrees of freedom
## Multiple R-squared: 0.2378, Adjusted R-squared: 0.236
## F-statistic: 133.6 on 7 and 2999 DF, p-value: < 2.2e-16
Creating a subset based on a single day (5/17/2020)
day <- ts_data %>% filter(date == as.Date("2020-05-17"))
features <- day %>% dplyr::select(-c("FIPS","covid","date","county_name","state_code","polyname","week"))
vis_only <- features %>% dplyr::select(-c("daily_distance_diff","encounters_rate"))
dist_only <- features %>% dplyr::select(-c("daily_visitation_diff","encounters_rate"))
encounter_only <- features %>% dplyr::select(-c("daily_distance_diff","daily_visitation_diff"))
Basic LM - each of the three for 5/17/2020
#Visitation only
model_vis <- lm(daily_visitation_diff ~., data = vis_only)
summary(model_vis)
##
## Call:
## lm(formula = daily_visitation_diff ~ ., data = vis_only)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.89200 -0.09174 -0.01790 0.07898 1.23437
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.808e-01 1.408e-02 -12.845 < 2e-16 ***
## cases 1.757e-05 5.513e-06 3.188 0.00145 **
## deaths -1.279e-04 9.313e-05 -1.373 0.16986
## death_rate 1.880e-02 7.175e-02 0.262 0.79330
## death_per_capita -1.031e+02 2.081e+01 -4.956 7.60e-07 ***
## Dim.1 6.200e-02 1.924e-03 32.222 < 2e-16 ***
## Dim.2 -2.351e-02 3.696e-03 -6.363 2.28e-10 ***
## class2 1.927e-02 2.655e-03 7.256 5.07e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2068 on 2999 degrees of freedom
## Multiple R-squared: 0.3611, Adjusted R-squared: 0.3596
## F-statistic: 242.2 on 7 and 2999 DF, p-value: < 2.2e-16
#Distance only
model_dist <- lm(daily_distance_diff ~., data = dist_only)
summary(model_dist)
##
## Call:
## lm(formula = daily_distance_diff ~ ., data = dist_only)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52203 -0.09451 -0.01384 0.06880 1.75304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.573e-01 1.110e-02 -14.176 < 2e-16 ***
## cases 9.364e-06 4.346e-06 2.155 0.031268 *
## deaths -1.043e-04 7.341e-05 -1.421 0.155503
## death_rate -1.368e-01 5.656e-02 -2.418 0.015648 *
## death_per_capita -3.994e+01 1.641e+01 -2.434 0.014971 *
## Dim.1 2.984e-02 1.517e-03 19.675 < 2e-16 ***
## Dim.2 -9.262e-03 2.913e-03 -3.179 0.001490 **
## class2 7.740e-03 2.093e-03 3.698 0.000221 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.163 on 2999 degrees of freedom
## Multiple R-squared: 0.1812, Adjusted R-squared: 0.1793
## F-statistic: 94.79 on 7 and 2999 DF, p-value: < 2.2e-16
#Encounter rate only
model_encounter <- lm(encounters_rate ~., data = encounter_only)
summary(model_encounter)
##
## Call:
## lm(formula = encounters_rate ~ ., data = encounter_only)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.154 -0.723 0.044 0.651 138.405
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.189e+00 2.829e-01 -4.204 2.70e-05 ***
## cases -6.744e-04 1.108e-04 -6.087 1.29e-09 ***
## deaths 4.814e-03 1.871e-03 2.572 0.0101 *
## death_rate -3.214e+00 1.442e+00 -2.229 0.0259 *
## death_per_capita 3.337e+02 4.182e+02 0.798 0.4250
## Dim.1 -1.038e+00 3.866e-02 -26.860 < 2e-16 ***
## Dim.2 -8.405e-01 7.426e-02 -11.318 < 2e-16 ***
## class2 2.788e-01 5.335e-02 5.226 1.85e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.155 on 2999 degrees of freedom
## Multiple R-squared: 0.2291, Adjusted R-squared: 0.2273
## F-statistic: 127.3 on 7 and 2999 DF, p-value: < 2.2e-16
Here, it looks like I may have 3 different distributions of the response variables, this suggests that I may need to consider different formula families for the GLM to model them properly. It may also be the case that they are all poisson distributed.
#visualizing the distribution of the response variables to help decide the distribution
#Normal-ish? Higher lambda
vis_only_nontrivial <- vis_only %>% filter(daily_visitation_diff != 0)
vis_only_nontrivial %>% ggplot(aes(x = daily_visitation_diff)) + geom_histogram(bins = 50)
#Skew right - Lower lambda
dist_only_nontrivial <- dist_only %>% filter(daily_distance_diff != 0)
dist_only_nontrivial %>% ggplot(aes(x = daily_distance_diff)) + geom_histogram(bins = 50)
#Very skew right - very low lambda
encounter_only_nontrivial <- encounter_only %>% filter(encounters_rate != 0)
encounter_only_nontrivial %>% ggplot(aes(x = encounters_rate)) + geom_histogram(bins = 50)
Turns out it isnt poisson, I should have caught the negative values… So, I ran this, and it looks logistic.
dist <- descdist(vis_only$daily_visitation_diff, discrete = FALSE)
ft <- fitdist(vis_only$daily_visitation_diff, distr = "logis" )
ft
## Fitting of the distribution ' logis ' by maximum likelihood
## Parameters:
## estimate Std. Error
## location -0.07818859 0.004320910
## scale 0.13821661 0.002148809
denscomp(ft)
This still doesnt work…
#
# vis_glm <- glm(daily_visitation_diff~., family=gaussian("log"), data= vis_only)
# summary(vis_glm)